*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0

* This do file calls Python geopandas to create Global Cities prime location shapefils for the GitHub toolkit
* Execution requires a Python working environment with packages mentioned at the beginning of the Python code
* Should the code not run through in Stata, the part between "python:" and "end" can be executed in Python
* In this case, you must manually set the root directory in line 31. 

* Create directories
	capture mkdir "$shapeoutput/GlobalCities"
	capture mkdir "$shapeoutput/Global"
	
	cd "$root"

* Generate shapefiles by city **************************************************
python: 

import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.errors import TopologicalError
import zipfile  # For creating zip files

root_directory = os.getcwd()

# File path for the list of identifiers in metrolist125.csv (relative to root directory)
identifier_file = os.path.join(root_directory, r'_Data/125 GLOBAL CITIES/METRO_LEVEL_COVARIATES/metrolist125.csv')

# Read the list of identifiers from the CSV file, using 'metro_id' as the column for identifiers
identifier_df = pd.read_csv(identifier_file)
identifiers = identifier_df['metro_id'].tolist()  # Using 'metro_id' as the identifier

# File paths (relative to root directory)
input_folder = os.path.join(root_directory, r'_Work/TEMP/125CitiesPls')
output_folder = os.path.join(root_directory, r'_Work/Output/Shapes/GlobalCities')

# Define the projection (EPSG:4326 corresponds to WGS 84 in decimal degrees)
crs = "EPSG:4326"

# Function to check if all coordinates are present and valid
def is_valid_coordinates(row):
    return all(pd.notna([row['lon_UL'], row['lat_UL'], row['lon_LR'], row['lat_LR']]))

# Function to create a polygon from coordinates
def create_polygon(row):
    try:
        return Polygon([
            (row['lon_UL'], row['lat_UL']),  # Upper Left
            (row['lon_LR'], row['lat_UL']),  # Upper Right
            (row['lon_LR'], row['lat_LR']),  # Lower Right
            (row['lon_UL'], row['lat_LR']),  # Lower Left
            (row['lon_UL'], row['lat_UL'])   # Close the polygon
        ])
    except (ValueError, TopologicalError) as e:
        print(f"Error creating polygon for row: {row['PLID']}, skipping. Error: {e}")
        return None

# Loop over each identifier and process files
for identifier in identifiers:
    identifier_str = str(identifier)  # Ensure identifier is a string
    
    # Construct the input file path for the grid data (relative to input folder)
    input_file = os.path.join(input_folder, f'grid125_PL_output_{identifier_str}.csv')
    
    if not os.path.exists(input_file):
        print(f"File not found: {input_file}")
        continue  # Skip to the next identifier if not found
    
    # Load the grid data
    df = pd.read_csv(input_file)

    # Filter rows with invalid or missing coordinates
    df = df[df.apply(is_valid_coordinates, axis=1)]
    
    # Create a Polygon for each row and handle errors gracefully
    df['geometry'] = df.apply(create_polygon, axis=1)

    # Drop rows where geometry creation failed (None values)
    df = df[df['geometry'].notnull()]

    # Convert the DataFrame to a GeoDataFrame in the original CRS (EPSG:4326)
    gdf = gpd.GeoDataFrame(df, geometry='geometry', crs=crs)
    
    # Reproject the GeoDataFrame to a UTM coordinate system (meters-based) for accurate buffering
    utm_gdf = gdf.to_crs(gdf.estimate_utm_crs())

    # Create a 10-meter buffer around each polygon
    utm_gdf['geometry'] = utm_gdf.buffer(10)

    # Reproject back to EPSG:4326 after buffering
    gdf_buffered = utm_gdf.to_crs(crs)

    # Group the grid cells by PLID and dissolve them into single polygons for each PLID
    gdf_buffered = gdf_buffered.dissolve(by='PLID')

    # Construct the output shapefile path for the identifier (relative to output folder)
    shapefile_path = os.path.join(output_folder, f'PL_{identifier_str}.shp')
    
    # Save the dissolved GeoDataFrame as a shapefile
    gdf_buffered.to_file(shapefile_path)
    print(f"Shapefile saved at: {shapefile_path}")

    # Create a zip file for the shapefile and its associated files
    zip_filename = os.path.join(output_folder, f'shape_{identifier_str}.zip')  # Use identifier in zip name
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for ext in ['.shp', '.shx', '.dbf', '.prj', '.cpg']:
            shapefile_component = shapefile_path.replace('.shp', ext)
            if os.path.exists(shapefile_component):
                zipf.write(shapefile_component, arcname=os.path.basename(shapefile_component))
                print(f"Added {shapefile_component} to {zip_filename}")
    
    print(f"Zip file created: {zip_filename}")



end

* Combine all shapefiles into one **********************************************

python:

import os
import geopandas as gpd
import pandas as pd
import zipfile  # For creating zip files

# Set the root directory to the Stata working directory
root_directory = os.getcwd()

# Folder where the individual shapefiles are saved (relative to root directory)
output_folder = os.path.join(root_directory, r'_Work/Output/Shapes/GlobalCities')

# Folder where combined shapefile will be saved (relative to root directory)
all_folder = os.path.join(root_directory, r'_Work/Output/Shapes/Global')

# File path for the combined shapefile (relative to root directory)
combined_shapefile = os.path.join(all_folder, 'PL_all.shp')

# Delete the old combined shapefile, if it exists, to avoid contamination
if os.path.exists(combined_shapefile):
    for ext in ['.shp', '.shx', '.dbf', '.cpg', '.prj']:
        old_file = combined_shapefile.replace('.shp', ext)
        if os.path.exists(old_file):
            os.remove(old_file)
            print(f"Deleted old file: {old_file}")

# List to store GeoDataFrames
gdf_list = []

# Ensure to clear any leftover data in memory
del gdf_list  # Remove any previous instances of gdf_list from memory
gdf_list = []  # Reinitialize to be sure

# Loop through the shapefiles in the output folder
for filename in os.listdir(output_folder):
    if filename.endswith(".shp") and filename.startswith("PL_"):
        # Full path to the shapefile
        shapefile_path = os.path.join(output_folder, filename)

        # Load the shapefile as a GeoDataFrame
        print(f"Loading shapefile: {shapefile_path}")
        gdf = gpd.read_file(shapefile_path)
        
        # Append the GeoDataFrame to the list
        gdf_list.append(gdf)

# Concatenate all GeoDataFrames into one
if gdf_list:
    combined_gdf = gpd.GeoDataFrame(pd.concat(gdf_list, ignore_index=True))

    # Save the combined GeoDataFrame as a new shapefile
    combined_gdf.to_file(combined_shapefile)
    print(f"Combined shapefile saved at: {combined_shapefile}")

    # Create a zip file for the combined shapefile
    zip_filename = os.path.join(all_folder, 'shape_all.zip')
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for ext in ['.shp', '.shx', '.dbf', '.cpg', '.prj']:
            shapefile_component = combined_shapefile.replace('.shp', ext)
            if os.path.exists(shapefile_component):
                zipf.write(shapefile_component, arcname=os.path.basename(shapefile_component))
                print(f"Added {shapefile_component} to {zip_filename}")
    
    print(f"Zip file created: {zip_filename}")
else:
    print("No shapefiles found to combine.")


end

* Script ends
